{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 共享单车租车量预测——特征工程"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1、导入必要的工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np  # 矩阵操作\n",
    "import pandas as pd # SQL数据处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>instant</th>\n",
       "      <th>dteday</th>\n",
       "      <th>season</th>\n",
       "      <th>yr</th>\n",
       "      <th>mnth</th>\n",
       "      <th>holiday</th>\n",
       "      <th>weekday</th>\n",
       "      <th>workingday</th>\n",
       "      <th>weathersit</th>\n",
       "      <th>temp</th>\n",
       "      <th>atemp</th>\n",
       "      <th>hum</th>\n",
       "      <th>windspeed</th>\n",
       "      <th>casual</th>\n",
       "      <th>registered</th>\n",
       "      <th>cnt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2011-01-01</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.344167</td>\n",
       "      <td>0.363625</td>\n",
       "      <td>0.805833</td>\n",
       "      <td>0.160446</td>\n",
       "      <td>331</td>\n",
       "      <td>654</td>\n",
       "      <td>985</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2011-01-02</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.363478</td>\n",
       "      <td>0.353739</td>\n",
       "      <td>0.696087</td>\n",
       "      <td>0.248539</td>\n",
       "      <td>131</td>\n",
       "      <td>670</td>\n",
       "      <td>801</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>2011-01-03</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.196364</td>\n",
       "      <td>0.189405</td>\n",
       "      <td>0.437273</td>\n",
       "      <td>0.248309</td>\n",
       "      <td>120</td>\n",
       "      <td>1229</td>\n",
       "      <td>1349</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>2011-01-04</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.212122</td>\n",
       "      <td>0.590435</td>\n",
       "      <td>0.160296</td>\n",
       "      <td>108</td>\n",
       "      <td>1454</td>\n",
       "      <td>1562</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>2011-01-05</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.226957</td>\n",
       "      <td>0.229270</td>\n",
       "      <td>0.436957</td>\n",
       "      <td>0.186900</td>\n",
       "      <td>82</td>\n",
       "      <td>1518</td>\n",
       "      <td>1600</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   instant      dteday  season  yr  mnth  holiday  weekday  workingday  \\\n",
       "0        1  2011-01-01       1   0     1        0        6           0   \n",
       "1        2  2011-01-02       1   0     1        0        0           0   \n",
       "2        3  2011-01-03       1   0     1        0        1           1   \n",
       "3        4  2011-01-04       1   0     1        0        2           1   \n",
       "4        5  2011-01-05       1   0     1        0        3           1   \n",
       "\n",
       "   weathersit      temp     atemp       hum  windspeed  casual  registered  \\\n",
       "0           2  0.344167  0.363625  0.805833   0.160446     331         654   \n",
       "1           2  0.363478  0.353739  0.696087   0.248539     131         670   \n",
       "2           1  0.196364  0.189405  0.437273   0.248309     120        1229   \n",
       "3           1  0.200000  0.212122  0.590435   0.160296     108        1454   \n",
       "4           1  0.226957  0.229270  0.436957   0.186900      82        1518   \n",
       "\n",
       "    cnt  \n",
       "0   985  \n",
       "1   801  \n",
       "2  1349  \n",
       "3  1562  \n",
       "4  1600  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#dpath = \"./data/\"\n",
    "df = pd.read_csv(\"day.csv\")\n",
    "\n",
    "#显示前5行\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  数据基本信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 731 entries, 0 to 730\n",
      "Data columns (total 16 columns):\n",
      " #   Column      Non-Null Count  Dtype  \n",
      "---  ------      --------------  -----  \n",
      " 0   instant     731 non-null    int64  \n",
      " 1   dteday      731 non-null    object \n",
      " 2   season      731 non-null    int64  \n",
      " 3   yr          731 non-null    int64  \n",
      " 4   mnth        731 non-null    int64  \n",
      " 5   holiday     731 non-null    int64  \n",
      " 6   weekday     731 non-null    int64  \n",
      " 7   workingday  731 non-null    int64  \n",
      " 8   weathersit  731 non-null    int64  \n",
      " 9   temp        731 non-null    float64\n",
      " 10  atemp       731 non-null    float64\n",
      " 11  hum         731 non-null    float64\n",
      " 12  windspeed   731 non-null    float64\n",
      " 13  casual      731 non-null    int64  \n",
      " 14  registered  731 non-null    int64  \n",
      " 15  cnt         731 non-null    int64  \n",
      "dtypes: float64(4), int64(11), object(1)\n",
      "memory usage: 91.5+ KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 特征工程"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.1. 数据去噪"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.2 数据分离"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 731 entries, 0 to 730\n",
      "Data columns (total 11 columns):\n",
      " #   Column      Non-Null Count  Dtype  \n",
      "---  ------      --------------  -----  \n",
      " 0   season      731 non-null    int64  \n",
      " 1   yr          731 non-null    int64  \n",
      " 2   mnth        731 non-null    int64  \n",
      " 3   holiday     731 non-null    int64  \n",
      " 4   weekday     731 non-null    int64  \n",
      " 5   workingday  731 non-null    int64  \n",
      " 6   weathersit  731 non-null    int64  \n",
      " 7   temp        731 non-null    float64\n",
      " 8   atemp       731 non-null    float64\n",
      " 9   hum         731 non-null    float64\n",
      " 10  windspeed   731 non-null    float64\n",
      "dtypes: float64(4), int64(7)\n",
      "memory usage: 62.9 KB\n"
     ]
    }
   ],
   "source": [
    "#删掉instant,dteday，casual,registered 并分离出标签y:cnt\n",
    "y = df['cnt']\n",
    "X = df.drop(['instant', 'dteday', 'casual','registered', 'cnt'], axis = 1) #按列操作\n",
    "\n",
    "X.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.3 离散型特征编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['yr', 'holiday', 'workingday', 'temp', 'atemp', 'hum', 'windspeed'], dtype='object')\n"
     ]
    }
   ],
   "source": [
    "#类别型特征 \n",
    "tar_types = ['season','mnth','weekday','weathersit' ];\n",
    "X_cats = {}\n",
    "\n",
    "for tar_type in tar_types:\n",
    "    X[tar_type].astype(\"object\")\n",
    "    X_cats[tar_type] = pd.get_dummies(X[tar_type], prefix=tar_type)\n",
    "    X = X.drop(tar_type, axis = 1)\n",
    "\n",
    "features_names = X.columns\n",
    "print(features_names)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 数值特征标准化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>temp</th>\n",
       "      <th>atemp</th>\n",
       "      <th>hum</th>\n",
       "      <th>windspeed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.355170</td>\n",
       "      <td>0.373517</td>\n",
       "      <td>0.828620</td>\n",
       "      <td>0.284606</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.379232</td>\n",
       "      <td>0.360541</td>\n",
       "      <td>0.715771</td>\n",
       "      <td>0.466215</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.171000</td>\n",
       "      <td>0.144830</td>\n",
       "      <td>0.449638</td>\n",
       "      <td>0.465740</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.175530</td>\n",
       "      <td>0.174649</td>\n",
       "      <td>0.607131</td>\n",
       "      <td>0.284297</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.209120</td>\n",
       "      <td>0.197158</td>\n",
       "      <td>0.449313</td>\n",
       "      <td>0.339143</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       temp     atemp       hum  windspeed\n",
       "0  0.355170  0.373517  0.828620   0.284606\n",
       "1  0.379232  0.360541  0.715771   0.466215\n",
       "2  0.171000  0.144830  0.449638   0.465740\n",
       "3  0.175530  0.174649  0.607131   0.284297\n",
       "4  0.209120  0.197158  0.449313   0.339143"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 数据标准化\n",
    "#from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "mm_X = MinMaxScaler()\n",
    "numerical_features = ['temp','atemp','hum','windspeed']\n",
    "temp = mm_X.fit_transform(X[numerical_features])\n",
    "\n",
    "#mm_Y = MinMaxScaler()\n",
    "#y = mm_Y.fit_transform(y.values.reshape(-1, 1))\n",
    "\n",
    "X_train_num = pd.DataFrame(data=temp, columns=numerical_features, index=X.index)\n",
    "X_train_num.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['yr', 'holiday', 'workingday', 'temp', 'atemp', 'hum', 'windspeed'], dtype='object')\n"
     ]
    }
   ],
   "source": [
    "print(features_names)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. 保存特征工程的结果到文件，供机器学习模型使用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OK\n"
     ]
    }
   ],
   "source": [
    "\n",
    "fe_data = pd.concat([X['yr'], X['holiday'], X['workingday'], X_train_num, X_cats['season'], X_cats['mnth'], X_cats['weekday'], X_cats['weathersit'],], axis = 1, ignore_index=False)\n",
    "#['season','yr', 'mnth','holiday','weekday','workingday','weathersit' ]\n",
    "#加上标签y\n",
    "fe_data[\"cnt\"] = y\n",
    "\n",
    "#保存结果到文件\n",
    "fe_data.to_csv('FE_bikeshare_cnt.csv', index=False)\n",
    "print(\"OK\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
